Chicago Postgres User Group Meetup, January 13, 2020





Parfait Gasana

Data Analyst, Winston & Strawn
@Parfait (StackOverflow) |   @ParfaitG (GitHub)


Open Source Success Stories


Challenging Commercial Market Share







Relational Database Solution


Use Case: Environment Database

Human Impact on the Global and Local Biosphere and Climate



suppressMessages(library(kableExtra))
suppressMessages(library(xts))
seaborn_palette <- c("#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B3", "#937860", 
                     "#DA8BC3", "#8C8C8C", "#CCB974", "#64B5CD", "#4C72B0", "#DD8452")

Connection

ODBC

library(DBI)
library(odbc)
conn <- dbConnect(odbc::odbc(), driver="PostgreSQL Unicode", 
                  server="localhost", database="environment",
                  uid="postgres", pwd="env19", port=6432)
dbListTables(conn)
 [1] "ag_census"             "arable_land"           "consumption"           "fws_species_count"     "fws_species_year"     
 [6] "global_mean_sea_level" "global_temperature"    "groundwater"           "iucn_species_count"    "ocean_data"           
[11] "plants_assessments"    "ppm_month"             "ppm_week"              "sea_ice_extent"        "us_co2_emissions"     
[16] "us_gdp"                "us_population"         "us_renewable_energy"   "world_co2_emissions"   "world_gdp"            
[21] "world_population"     
dbDisconnect(conn)

JDBC

library(rJava)
library(RJDBC)
drv <- JDBC("org.postgresql.Driver",
           "/usr/lib/jvm/java-8-oracle/lib/postgresql-42.2.2.jar")
conn <- dbConnect(drv, "jdbc:postgresql://localhost:6432/environment", "postgres", "env19")
dbListTables(conn, schema="public")
 [1] "ag_census_pkey"               "arable_land_pkey"             "co2_emissions_pkey"          
 [4] "consumption_pkey"             "fws_species_count_pkey"       "fws_species_year_pkey"       
 [7] "global_mean_sea_level_pkey"   "global_temperature_pkey"      "groundwater_pkey"            
[10] "iucn_species_count_pkey"      "ocean_data_pkey"              "plants_assessments_pkey"     
[13] "ppm_month_pkey"               "ppm_week_pkey"                "sea_ice_extent_pkey"         
[16] "us_gdp_pkey"                  "us_renewable_energy_pkey"     "usa_population_pkey"         
[19] "world_co2_emissions_pkey"     "world_gdp_pkey"               "world_population_pkey"       
[22] "ag_census_id_seq"             "arable_land_id_seq"           "co2_emissions_id_seq"        
[25] "consumption_id_seq"           "fws_species_count_id_seq"     "fws_species_year_id_seq"     
[28] "global_mean_sea_level_id_seq" "global_temperature_id_seq"    "groundwater_id_seq"          
[31] "iucn_species_count_id_seq"    "ocean_data_id_seq"            "plants_assessments_id_seq"   
[34] "ppm_month_id_seq"             "ppm_week_id_seq"              "sea_ice_extent_id_seq"       
[37] "us_gdp_id_seq"                "us_renewable_energy_id_seq"   "usa_population_id_seq"       
[40] "world_co2_emissions_id_seq"   "world_gdp_id_seq"             "world_population_id_seq"     
[43] "ag_census"                    "arable_land"                  "consumption"                 
[46] "fws_species_count"            "fws_species_year"             "global_mean_sea_level"       
[49] "global_temperature"           "groundwater"                  "iucn_species_count"          
[52] "ocean_data"                   "plants_assessments"           "ppm_month"                   
[55] "ppm_week"                     "sea_ice_extent"               "us_co2_emissions"            
[58] "us_gdp"                       "us_population"                "us_renewable_energy"         
[61] "world_co2_emissions"          "world_gdp"                    "world_population"            
dbDisconnect(conn)
[1] TRUE

R-Postgres API

library(RPostgreSQL)
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="localhost", dbname="environment",
                  user="postgres", password="env19", port=6432)
dbListTables(conn)
 [1] "ag_census"             "plants_assessments"    "us_gdp"                "us_renewable_energy"   "world_gdp"            
 [6] "world_co2_emissions"   "global_mean_sea_level" "groundwater"           "ocean_data"            "us_co2_emissions"     
[11] "us_population"         "ppm_month"             "ppm_week"              "consumption"           "fws_species_count"    
[16] "fws_species_year"      "iucn_species_count"    "global_temperature"    "sea_ice_extent"        "world_population"     
[21] "arable_land"          

Import & Manage Data

CREATE TABLE global_temperature (
   id SERIAL NOT NULL PRIMARY KEY,
   year INT,
   period VARCHAR(50),
   global_mean NUMERIC(5,2)
);

\copy global_temperature (year, period, global_mean) FROM '/home/parfaitg/Databases/SQL_Server/ENVIRONMENT/global_temperature.csv' DELIMITER ',' CSV HEADER;

COMMENT ON TABLE global_temperature IS 'Source: NASA - Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, LOTI) (https://data.giss.nasa.gov/gistemp/)';
SELECT relname as table, obj_description(oid) as comment
FROM pg_class
WHERE relkind = 'r'
  AND obj_description(oid) IS NOT NULL
  ORDER BY relname
kable_styling(kable(comments_df),
              bootstrap_options = c("striped", "hover"))
table comment
ag_census U.S. Department of Agriculture, Census Data Query Tool (CDQT): https://www.nass.usda.gov/Quick_Stats/CDQT/chapter/1/table/1
arable_land Source: World Bank - Arable land (% of land area) (https://data.worldbank.org/indicator/AG.LND.ARBL.ZS?end=2016&start=1961&view=chart)
consumption Source: EIA - Monthly Energy Review, publication of recent and historical energy statistics (https://www.eia.gov/totalenergy/data/monthly/)
fws_species_count Source: US FWS - U.S. Federal Endangered and Threatened Species (https://ecos.fws.gov/ecp/species-reports)
fws_species_year Source: US FWS - U.S. Federal Endangered and Threatened Species (https://ecos.fws.gov/ecp/species-reports)
global_mean_sea_level Source: NASA - Global Mean Sea Level (mm): https://climate.nasa.gov/vital-signs/sea-level/
global_temperature Source: NASA - Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, LOTI) (https://data.giss.nasa.gov/gistemp/)
groundwater U.S. Geological Survey (USGS) Groundwater-Level Annual Statistics for the Nation: https://waterdata.usgs.gov/nwis/annual?referred_module=gw&search_criteria=site_tp_cd&submitted_form=introduction
iucn_species_count Source: IUCN - Red List of Threatened Species (https://www.iucnredlist.org/resources/summary-statistics#Summary%20Tables)
ocean_data Source: NOAA - Global Ocean Data Analysis Project (https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.nodc:0162565#)
plants_assessments Source: Botanic Gardens Conservation International - Plant Threat Search: https://tools.bgci.org/threat_search.php
ppm_month Source: NOAA - Mauna Loa CO2 records Carbon PPM (https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html)
ppm_week Source: NOAA - Mauna Loa CO2 records Carbon PPM (https://www.esrl.noaa.gov/gmd/ccgg/trends/data.html)
sea_ice_extent Source: NSIDC - Sea Ice Index Data (https://nsidc.org/data/seaice_index/archives)
us_co2_emissions Source: EIA - Monthly Energy Review, publication of recent and historical energy statistics (https://www.eia.gov/totalenergy/data/monthly/)
us_gdp Source: Federal Reserve Bank of St. Louis - US Real Gross Domestic Product by Industry: https://fred.stlouisfed.org/categories/33045
us_population Source: Federal Reserve Bank of St. Louis - U.S. Monthly Population (https://fred.stlouisfed.org/series/POPTHM)
us_renewable_energy Source: EIA - Renewable Energy Production and Consumption by Source: https://www.eia.gov/totalenergy/data/monthly/
world_co2_emissions Source: World Bank - World CO2 emissions (kt): https://data.worldbank.org/indicator/EN.ATM.CO2E.KT?end=2014&start=1960&view=chart
world_gdp Source: World Bank - World GDP (Current US$): https://data.worldbank.org/indicator/NY.GDP.MKTP.CD
world_population Source: World Bank - Arable land (% of land area) (https://data.worldbank.org/indicator/AG.LND.ARBL.ZS?end=2016&start=1961&view=chart)

Analyze Data

Department of Energy’s EIA: Monthly Energy Review &

NOAA Mauna Loa, Hawaii Observatory - Carbon PPM

SELECT p.date_year, p.date_month, CONCAT(p.date_year, '-', p.date_month, '-01')::date as date_day,
               p.average_ppm as "carbon ppm", c.energy_consumed as "btu consumed", e.energy_co2 as "co2 emissions"
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
  ORDER BY p.date_year, p.date_month
SELECT p.date_year,
       sum(p.average_ppm) as carbon_ppm_total, 
       avg(p.average_ppm) as carbon_ppm_mean, 
       sum(c.energy_consumed) as btu_consumed_total,
       avg(c.energy_consumed) as btu_consumed_mean,
       sum(e.energy_co2) as co2_emissions_total,
       avg(e.energy_co2) as co2_emissions_mean
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
GROUP BY p.date_year
ORDER BY p.date_year
kable_styling(kable(tail(agg_df, 10)),
              bootstrap_options = c("striped", "hover"))
date_year carbon_ppm_total carbon_ppm_mean btu_consumed_total btu_consumed_mean co2_emissions_total co2_emissions_mean
38 2010 4678.79 389.8992 6641.355 553.4462 5585.741 465.4784
39 2011 4699.83 391.6525 6473.666 539.4722 5446.133 453.8444
40 2012 4726.24 393.8533 5684.503 473.7086 5237.300 436.4417
41 2013 4758.25 396.5208 6689.368 557.4473 5363.018 446.9182
42 2014 4783.77 398.6475 7007.139 583.9283 5411.193 450.9327
43 2015 4810.01 400.8342 6465.092 538.7577 5264.776 438.7313
44 2016 4850.87 404.2392 6033.098 502.7582 5172.402 431.0335
45 2017 4878.64 406.5533 6111.580 509.2983 5130.589 427.5491
46 2018 4902.26 408.5217 6896.516 574.7097 5267.750 438.9792
47 2019 410.83 410.8300 1163.200 1163.2000 493.839 493.8390
metric_ts <- xts(x=metrics_df[c("carbon ppm", "btu consumed", "co2 emissions")], 
                 order.by=metrics_df$date_day)
plot(metric_ts, main = "Energy and Carbon PPM Metrics",
     legend.loc="bottomright", 
     col = seaborn_palette[1:3],
     yaxis.right=FALSE,
     axes=FALSE,
     lwd=1,
     cex.main=3,
     major.ticks="years",
     major.format="%Y",
     minor.format="%Y",
     grid.ticks.lty=1)

Seasonal Decomposition

carbonppm_ts <- ts(metrics_df$`carbon ppm`, start=c(1973, 1), frequency=12)
carbonppm_stl <- stl(carbonppm_ts, s.window="periodic")
plot(carbonppm_stl, col = seaborn_palette[1], 
     main="Seasonal Decomposition of Global Carbon PPM")

consumed_ts <- ts(metrics_df$`btu consumed`, start=c(1973, 1), frequency=12)
consumed_stl <- stl(consumed_ts, s.window="periodic")
plot(consumed_stl, col = seaborn_palette[2], 
     main="Seasonal Decomposition of U.S. Energy Consumption")

emissions_ts <- ts(metrics_df$`co2 emissions`, start=c(1973, 1), frequency=12)
emissions_stl <- stl(emissions_ts, s.window="periodic")
plot(emissions_stl, col = seaborn_palette[3],
     main = "Seasonal Decomposition of U.S. CO2 Emissions")

sql <- "WITH c1 AS
           (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                   REPLACE(description, 'Primary Energy Consumed by the ', '') AS \"Sector\",
                   SUM(energy_consumed) AS \"Primary Energy Consumed\"
            FROM consumption
            WHERE msn IN ('TXICBUS', 'TXRCBUS', 'TXACBUS', 'TXCCBUS', 'TXEIBUS')
            GROUP BY CONCAT((date_year/10)::int * 10, 's'), description
           ), c2 AS
           (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                   REPLACE(description, 'Total Energy Consumed by the ', '') AS \"Sector\",
                   SUM(energy_consumed) AS \"Total Energy Consumed\"
            FROM consumption
            WHERE msn IN ('TECCBUS', 'TEACBUS', 'TEICBUS', 'TERCBUS')
            GROUP BY CONCAT((date_year/10)::int * 10, 's'), description
           )
        SELECT c1.decade, c1.\"Sector\", c1.\"Primary Energy Consumed\", c2.\"Total Energy Consumed\"
        FROM c1
        LEFT JOIN c2
            ON c1.\"Sector\" = c2.\"Sector\" AND c1.decade = c2.decade
        ORDER BY c1.decade, c1.\"Sector\"
       "
consumed_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(consumed_df)),
              bootstrap_options = c("striped", "hover"))
decade Sector Primary Energy Consumed Total Energy Consumed
35 2000s Transportation Sector 546616.33 548112.4
36 2010s Commercial Sector 77171.71 326503.7
37 2010s Electric Power Sector 693591.66 NA
38 2010s Industrial Sector 388661.21 570178.3
39 2010s Residential Sector 117157.95 378492.5
40 2010s Transportation Sector 490149.36 491557.4

Consumption and CO2 Emissions

plot_mat <- with(subset(consumed_df, Sector != 'Electric Power Sector'),
                 tapply(`Total Energy Consumed`, list(decade, `Sector`), sum))
par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Total Energy Consumption by Sector", cex.main=1.5,
        col=seaborn_palette[1:8], ylim=c(0, 8E5), xaxt="n", yaxt="n", beside=TRUE)
axis(side=1, at=c(5, 14, 23, 32), labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:8], ncol=8)

plot_mat <- with(consumed_df, tapply(`Primary Energy Consumed`, list(decade, `Sector`), sum))
par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Primary Energy Consumption by Sector", cex.main=1.5,
        col=seaborn_palette[1:8], ylim=c(0, 9E5), xaxt="n", yaxt="n", beside=TRUE)
axis(side=1, at=c(5, 14, 23, 32, 41), labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:8], ncol=8)

sql <- "SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
               REPLACE(REPLACE(description, 'Total Energy ', ''), ' CO2 Emissions', '') AS \"Sector\",
               SUM(energy_co2) AS \"Total CO2 Emissions\"
        FROM us_co2_emissions
        WHERE msn IN ('TERCEUS', 'TECCEUS', 'TEACEUS', 'TXEIEUS')
        GROUP BY date_year, description
        ORDER BY date_year, description
       "
emissions_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(emissions_df)),
              bootstrap_options = c("striped", "hover"))
decade Sector Total CO2 Emissions
183 2010s Residential Sector 2034.691
184 2010s Transportation Sector 3830.401
185 2010s Commercial Sector 164.804
186 2010s Electric Power Sector 274.988
187 2010s Residential Sector 223.165
188 2010s Transportation Sector 295.360
plot_mat <- with(emissions_df, tapply(`Total CO2 Emissions`, list(decade, `Sector`), sum))
par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. CO2 Emissions by Sector", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 6E4), xaxt="n", yaxt="n", beside=TRUE)
axis(side=1, at=c(3, 9, 15, 21)+0.5, labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

sql <- "WITH sub AS
          (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                  REPLACE($1, '%', '') as sector,
                  REPLACE(
                     REPLACE(
                       REPLACE(
                               REPLACE(description, 'Commercial Sector CO2 Emissions', ''), 
                               'Residential Sector CO2 Emissions', ''
                       ), 'Transportation Sector CO2 Emissions', ''),
                  ' ', '\n')  AS \"Type\",
                 energy_co2
           FROM us_co2_emissions
           WHERE description LIKE $2)
       SELECT decade, sector, \"Type\", 
              SUM(energy_co2) AS \"Total CO2 Emissions\"
       FROM sub
       GROUP BY decade, sector, \"Type\"
       ORDER BY decade, sector, \"Type\"
       "
params <- paste0(c("%Transportation", "%Residential", "%Commercial"), " Sector%")
emissions_type_df <- do.call(rbind, lapply(params, function(p) dbGetQuery(conn, sql, param=list(p, p))))
par(mfrow=c(3,2), mar=c(5, 5, 2, 1), mai = c(0.7, 0.2, 0.7, 0.2))
output <- by(emissions_type_df, emissions_type_df$sector, function(sub) {
  plot_mat <- with(sub, tapply(`Total CO2 Emissions`, list(decade, `Type`), sum))
  barplot(plot_mat, main=paste("U.S. CO2 Emissions by", sub$sector[[1]]), cex.main=1.5,
          col=seaborn_palette[1:5], ylim=c(0, ceiling(max(plot_mat, na.rm=TRUE)/1E4) * 1E4), 
          xaxt="n", yaxt="n", beside=TRUE)
  
  axis(side=1, at=c(3,9,15,21,27,33,39,45,51,57,63)[seq_along(colnames(plot_mat))] + 0.5,
       labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
  legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
})

Renewable Energy: Production and Consumption

sql <- "SELECT energy_type,
               date,
               SUM(production) AS production,
               SUM(consumption) AS consumption
        FROM us_renewable_energy
        GROUP BY energy_type,
                 date
        ORDER BY energy_type,
                 date
       "
renewable_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(renewable_df)),
              bootstrap_options = c("striped", "hover"))
energy_type date production consumption
4678 Wood Energy 2019-02-01 190.887 182.491
4679 Wood Energy 2019-03-01 198.621 191.507
4680 Wood Energy 2019-04-01 195.791 187.670
4681 Wood Energy 2019-05-01 201.743 193.775
4682 Wood Energy 2019-06-01 198.379 189.036
4683 Wood Energy 2019-07-01 205.023 196.873
par(mfrow=c(3,3), mar=c(5, 5, 2, 1))
output <- by(renewable_df, renewable_df$energy_type, function(sub) {
  
  metric_ts <- xts(x=sub[c("production", "consumption")], 
                   order.by=sub$date)
  
  print(plot(metric_ts, main = sub$energy_type[1],
             legend.loc="bottomright", 
             col = seaborn_palette[1:3],
             yaxis.right=FALSE,
             axes=FALSE,
             lwd=1,
             cex.main=3,
             major.ticks="years",
             major.format="%Y",
             minor.format="%Y",
             grid.ticks.lty=1))
  
})

U.S. Geological Survey - Groundwater Well Depth Data

sql <- "SELECT g.year, g.month,
               avg(g.mean_value) as mean_value
        FROM groundwater g
        WHERE g.year BETWEEN 1990 AND 2019
        GROUP BY g.year, g.month"
groundwater_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(groundwater_df)),
              bootstrap_options = c("striped", "hover"))
year month mean_value
350 2019 2 63735.69
351 2019 3 60792.13
352 2019 4 54698.44
353 2019 5 53237.05
354 2019 6 49160.99
355 2019 7 76941.00
boxplot(mean_value ~ year, groundwater_df, col=seaborn_palette[1:10], 
        main="Groundwater Well Depth Mean Values", cex.main=1.5)

sql <- "WITH sites AS (
            SELECT CONCAT(g.year, '-', g.month, '-1')::date AS \"date\",
                   g.site_name,
                   AVG(g.mean_value) as mean_value
            FROM groundwater g
            WHERE g.year BETWEEN 1990 AND 2019
            GROUP BY CONCAT(g.year, '-', g.month, '-1')::date,
                     g.site_name
        ), bottom AS (
            SELECT site_name, AVG(mean_value) AS mean_value, 'bottom_sites' AS category
            FROM sites 
            GROUP BY site_name
            ORDER BY AVG(mean_value) ASC LIMIT 5
        ), top AS (
            SELECT site_name, AVG(mean_value) AS mean_value, 'top_sites' AS category
            FROM sites 
            GROUP BY site_name
            ORDER BY AVG(mean_value) DESC LIMIT 5
        )
        SELECT sites.\"date\", sites.mean_value, sites.site_name, top.category
        FROM sites
        INNER JOIN top
            ON sites.site_name = top.site_name
        UNION
        SELECT sites.\"date\", sites.mean_value, sites.site_name, bottom.category
        FROM sites
        INNER JOIN bottom
            ON sites.site_name = bottom.site_name
        ORDER BY category, site_name, \"date\""
groundwater_df <- dbGetQuery(conn, sql)
kable_styling(kable(head(groundwater_df)),
              bootstrap_options = c("striped", "hover"))
date mean_value site_name category
1991-06-01 31 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
1991-07-01 35 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
1991-08-01 41 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
1991-09-01 42 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
1991-10-01 36 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
1991-11-01 40 FAIRPLAINS 2 WELL~ ST. CROIX~ USVI bottom_sites
groundwater_df$year <- format(groundwater_df$date, "%Y")
par(mfrow=c(5,2), mar=c(5, 5, 2, 1))
output <- by(groundwater_df, groundwater_df$site_name, function(sub) {
  
  metric_ts <- xts(x=sub[c("mean_value")], 
                   order.by=sub$date)
  
  print(plot(metric_ts, main = paste(sub$site_name[1], ':', sub$category[1]),
             legend.loc="bottomright", 
             col = seaborn_palette[1:3],
             yaxis.right=FALSE,
             axes=FALSE,
             lwd=1,
             cex.main=3,
             major.ticks="years",
             major.format="%Y",
             minor.format="%Y",
             grid.ticks.lty=1))
  
})

U.S. FWS Endangered Species List

sql <- "SELECT CONCAT((f.date_year/10)::int * 10, 's') AS decade, 
               f.taxonomic_group,
               f.current_status,
               count(*) AS species_count
        FROM fws_species_year f
        WHERE f.current_status IN ('Endangered', 'Recovery', 'Resolved Taxon', 
                                   'Threatened', 'Extinction')
        GROUP BY CONCAT((f.date_year/10)::int * 10, 's'),
                 f.taxonomic_group,
                 f.current_status
        ORDER BY CONCAT((f.date_year/10)::int * 10, 's'),
                 f.taxonomic_group,
                 f.current_status"
fws_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(fws_df)),
              bootstrap_options = c("striped", "hover"))
decade taxonomic_group current_status species_count
141 2010s Mammals Resolved Taxon 1
142 2010s Mammals Threatened 11
143 2010s Reptiles Endangered 4
144 2010s Reptiles Threatened 9
145 2010s Snails Endangered 18
146 2010s Snails Threatened 1
par(mfrow=c(5,3), mar=c(5, 5, 2, 1))
output <- by(fws_df, fws_df$taxonomic_group, function(sub) {
  plot_mat <- with(sub, tapply(species_count, list(decade, current_status), sum))
  
  barplot(plot_mat, main=sub$taxonomic_group[[1]], cex.main=1.5,
          ylim = c(0, max(plot_mat, na.rm=TRUE)+5),
          col=seaborn_palette[seq_along(row.names(plot_mat))], beside=TRUE)
  legend("top", row.names(plot_mat), fill=seaborn_palette[seq_along(row.names(plot_mat))], 
         ncol=length(row.names(plot_mat)))
  box()
})

U.S. Department of Agriculture: Agriculture Census

sql <- "SELECT year,
               CASE
                    WHEN domain_category = '' THEN 'TOTAL\nOPERATIONS'
                    ELSE REPLACE(REPLACE(REPLACE(domain_category, '(', '\n('), 'TO', 'TO\n'), 'OR', 'OR\n')
               END AS domain_category, 
               value
        FROM ag_census 
        WHERE data_item = 'FARM OPERATIONS - NUMBER OF OPERATIONS'"
agcensus_df <- dbGetQuery(conn, sql)
plot_mat <- with(agcensus_df, tapply(value, list(year, domain_category), sum))
par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Farm Operations", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 2E6+5E5), beside=TRUE, xaxt="n", yaxt="n")
axis(side=1, at=c(3,9,15,21,27,33,39,45)+0.5, labels=colnames(plot_mat), tick=FALSE, padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

sql <- "WITH sub AS (
          SELECT year,
                 data_item,
                 REPLACE(
                    REPLACE(
                        REPLACE(
                           REPLACE(
                              REPLACE(domain_category, ';', ','),
                              '(', '\n('), 
                           'TO', 'TO\n'),
                        ' OR', ' OR\n'),
                    'LESS', 'LESS\n') AS domain_category, 
                 value
          FROM ag_census 
          WHERE data_item LIKE '%COMMODITY TOTALS%'
            AND value IS NOT NULL
       ) 
       SELECT year, 
              CASE
                  WHEN TRIM(domain_category) = '' 
                  THEN CASE 
                            WHEN data_item = 'COMMODITY TOTALS - SALES; MEASURED IN $' 
                            THEN 'TOTAL\nSALES'
                            WHEN data_item = 'COMMODITY TOTALS - SALES; MEASURED IN $ / OPERATION' 
                            THEN 'TOTAL\nSALES\nPER OPERATION'
                       END
                  ELSE domain_category 
              END AS domain_category,
              value
       FROM sub
      "
agcensus_df <- within(dbGetQuery(conn, sql),
                      domain_category <- factor(domain_category,
                                                levels = c("FARM SALES: \n(LESS\n THAN 2,500 $)", "FARM SALES: \n(2,500 TO\n 4,999 $)", 
                                                           "FARM SALES: \n(5,000 TO\n 9,999 $)", "FARM SALES: \n(10,000 TO\n 24,999 $)",
                                                           "FARM SALES: \n(25,000 TO\n 49,999 $)", "FARM SALES: \n(50,000 TO\n 99,999 $)",
                                                           "FARM SALES: \n(100,000 TO\n 499,999 $)", "FARM SALES: \n(500,000 OR\n MORE $)", 
                                                           "TOTAL\nSALES", "TOTAL\nSALES\nPER OPERATION"))
)
kable_styling(kable(tail(agcensus_df)),
              bootstrap_options = c("striped", "hover"))
year domain_category value
45 2017 FARM SALES: (5,000 TO 9,999 $) 208074
46 2017 FARM SALES: (10,000 TO 24,999 $) 228218
47 2017 FARM SALES: (25,000 TO 49,999 $) 144113
48 2017 FARM SALES: (50,000 TO 99,999 $) 119434
49 2017 FARM SALES: (100,000 TO 499,999 $) 218771
50 2017 FARM SALES: (500,000 OR MORE $) 146568
plot_mat <- with(agcensus_df[agcensus_df$domain_category != 'TOTAL\nSALES',], 
                 tapply(value, list(year, factor(domain_category)), sum))
par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Commodity Totals - Farm Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 1E6), beside=TRUE, xaxt="n", yaxt="n")
axis(side=1, at=c(3,9,15,21,27,33,39,45,51)+0.5, labels=colnames(plot_mat), tick=FALSE, padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

plot_mat <- with(agcensus_df[agcensus_df$domain_category == 'TOTAL\nSALES',], 
                 tapply(value, list(year, factor(domain_category)), sum))
par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Commodity Totals - Overall Farm Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 4E11+5E10), beside=TRUE, xaxt="n", yaxt="n", space=2)
        
axis(side=1, at=c(2.5,5.5,8.5,11.5,14.5), labels=row.names(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

sql <- "SELECT year, 
               REPLACE(REPLACE(data_item, ' - OPERATIONS WITH AREA HARVESTED', ''), '; GRAIN', '') as crop,
               value
        FROM ag_census
        WHERE data_item IN ('CORN; GRAIN - OPERATIONS WITH AREA HARVESTED',
                            'WHEAT - OPERATIONS WITH AREA HARVESTED',
                            'SOYBEANS - OPERATIONS WITH AREA HARVESTED')
        ORDER BY year, data_item
       "
crops_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(crops_df)),
              bootstrap_options = c("striped", "hover"))
year crop value
10 2012 CORN 348530
11 2012 SOYBEANS 302963
12 2012 WHEAT 147632
13 2017 CORN 304801
14 2017 SOYBEANS 303191
15 2017 WHEAT 104792
plot_mat <- with(crops_df, tapply(value, 
                                  list(year, factor(crop, levels=c("WHEAT", "SOYBEANS", "CORN"))), sum)
)
par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Operations by Specific Crops", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 5E5), yaxt="n", beside=TRUE)
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

sql <- "SELECT year, 
               REPLACE(
                  REPLACE(
                     REPLACE(data_item, ' - OPERATIONS WITH SALES', ''), 
                     '; INCL CALVES', ''),
                  '; BROILERS', '') as livestock,
               value
        FROM ag_census
        WHERE data_item IN ('CATTLE; INCL CALVES - OPERATIONS WITH SALES',
                            'HOGS - OPERATIONS WITH SALES',
                            'CHICKENS; BROILERS - OPERATIONS WITH SALES')
        ORDER BY year, data_item
       "
livestock_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(livestock_df)),
              bootstrap_options = c("striped", "hover"))
year livestock value
10 2012 CATTLE 740978
11 2012 CHICKENS 32935
12 2012 HOGS 55882
13 2017 CATTLE 711827
14 2017 CHICKENS 32751
15 2017 HOGS 64871
plot_mat <- with(livestock_df, tapply(value, 
                                  list(year, factor(livestock, levels=c( "CHICKENS", "HOGS", "CATTLE"))), sum)
)
par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Operations by Specific Livestock", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 1E6+2E5), yaxt="n", beside=TRUE)
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

sql <- "SELECT year, 
               CASE
                   WHEN data_item = 'CROP TOTALS - SALES; MEASURED IN $' THEN 'CROP TOTALS'
                   WHEN data_item = 'ANIMAL TOTALS; INCL PRODUCTS - SALES; MEASURED IN $' THEN 'ANIMAL TOTALS'
               END as category,
               value
        FROM ag_census
        WHERE data_item IN ('CROP TOTALS - SALES; MEASURED IN $',
                            'ANIMAL TOTALS; INCL PRODUCTS - SALES; MEASURED IN $')
        ORDER BY year, data_item
       "
agtotal_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(agtotal_df)),
              bootstrap_options = c("striped", "hover"))
year category value
5 2007 ANIMAL TOTALS 153562563000
6 2007 CROP TOTALS 143657928000
7 2012 ANIMAL TOTALS 182247407000
8 2012 CROP TOTALS 212397074000
9 2017 ANIMAL TOTALS 194975996000
10 2017 CROP TOTALS 193546699000
plot_mat <- with(agtotal_df, tapply(value, list(year, category), sum))
par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Crop vs Animal Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 2E11+5E10), yaxt="n", beside=TRUE)
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)

World Metrics

sql <- "WITH pop AS 
            (SELECT p.year,
                    p.population
             FROM world_population p
             WHERE p.country_name = 'World'
               AND p.year BETWEEN 2000 AND 2019
             ),
              
              land AS
            (SELECT a.year,
                    a.percent_arable
             FROM arable_land a
             WHERE a.country_name = 'World'
               AND a.year BETWEEN 2000 AND 2019
             ),
         
              fauna AS
            (SELECT i.year,
                    SUM(i.species_count) AS animals_count
             FROM iucn_species_count i
             WHERE i.year BETWEEN 2000 AND 2019
             GROUP BY i.year
             ),
         
              flora AS
            (SELECT p.assessment_year as year,
                    COUNT(*) AS plants_count
             FROM plants_assessments p
             WHERE p.assessment_year BETWEEN 2000 AND 2019
               AND p.interpreted_status = 'Threatened'
             GROUP BY p.assessment_year
             ),             
         
              ice AS
            (SELECT s.date_year as year,
                    AVG(s.extent) FILTER(WHERE s.region = 'Arctic') AS arctic_sea_ice_extent,
                    AVG(s.extent) FILTER(WHERE s.region = 'Antarctica') AS antarctic_sea_ice_extent
             FROM sea_ice_extent s
             WHERE s.date_year BETWEEN 2000 AND 2019
             GROUP BY s.date_year
             ),
             
              ocean AS
            (SELECT o.year as year,
                    AVG(o.tco2) AS total_co2,
                    AVG(o.phts25p0) AS ph_scale
             FROM ocean_data o
             WHERE o.year BETWEEN 2000 AND 2019
               AND o.tco2 <> -9999 AND o.phts25p0 <> -9999
             GROUP BY o.year
             ),
             
              temp AS
            (SELECT g.year as year,
                    AVG(g.global_mean) AS global_mean
             FROM global_temperature g
             WHERE g.year BETWEEN 2000 AND 2019
             GROUP BY g.year
             )
             
         SELECT pop.year, pop.population, land.percent_arable, fauna.animals_count, flora.plants_count,
                ice.arctic_sea_ice_extent, ice.antarctic_sea_ice_extent, 
                ocean.total_co2, ocean.ph_scale, temp.global_mean
         FROM pop 
         JOIN land USING (year)
         JOIN fauna USING (year)
         JOIN flora USING (year)
         JOIN ice USING (year)
         JOIN ocean USING (year)
         JOIN temp USING (year)
         ORDER BY pop.year"
env_world_df <- dbGetQuery(conn, sql)
kable_styling(kable(tail(env_world_df)), font_size = 12,
              bootstrap_options = c("striped", "hover"))
year population percent_arable animals_count plants_count arctic_sea_ice_extent antarctic_sea_ice_extent total_co2 ph_scale global_mean
7 2008 6765986891 10.80640 16928 3279 10.97785 12.23941 2181.326 7.685694 0.5155556
8 2009 6849272706 10.77478 17291 2294 10.93197 12.04859 2176.104 7.735574 0.6294444
9 2010 6932596129 10.74432 18351 3217 10.71139 12.10679 2191.072 7.702093 0.7022222
10 2011 7014792135 10.80189 19570 2850 10.48350 11.50057 2224.690 7.675097 0.5844444
11 2012 7099311892 10.87358 20219 7556 10.40610 12.00444 2187.112 7.731700 0.6183333
12 2013 7184861447 10.89469 21353 5465 10.89712 12.52361 2183.400 7.723483 0.6438889
kable_styling(kable(cor(env_world_df[-1])), font_size = 12,
              bootstrap_options = c("striped", "hover"))
population percent_arable animals_count plants_count arctic_sea_ice_extent antarctic_sea_ice_extent total_co2 ph_scale global_mean
population 1.0000000 0.1656147 0.9774118 0.7453433 -0.8026257 0.5047082 0.5832658 -0.1705909 0.5630321
percent_arable 0.1656147 1.0000000 0.2441270 0.4917138 0.0689147 0.3901610 -0.0431136 0.1567368 -0.3399306
animals_count 0.9774118 0.2441270 1.0000000 0.7450838 -0.8024194 0.5181381 0.6740957 -0.2912376 0.4783654
plants_count 0.7453433 0.4917138 0.7450838 1.0000000 -0.7224389 0.3576609 0.3673134 -0.1134181 0.3630865
arctic_sea_ice_extent -0.8026257 0.0689147 -0.8024194 -0.7224389 1.0000000 -0.0588615 -0.7926131 0.5220906 -0.5461557
antarctic_sea_ice_extent 0.5047082 0.3901610 0.5181381 0.3576609 -0.0588615 1.0000000 0.1042700 0.0767117 0.0713994
total_co2 0.5832658 -0.0431136 0.6740957 0.3673134 -0.7926131 0.1042700 1.0000000 -0.8165837 0.2653096
ph_scale -0.1705909 0.1567368 -0.2912376 -0.1134181 0.5220906 0.0767117 -0.8165837 1.0000000 0.0463203
global_mean 0.5630321 -0.3399306 0.4783654 0.3630865 -0.5461557 0.0713994 0.2653096 0.0463203 1.0000000
par(mfrow=c(4,2), mai = c(0.3, 0.2, 0.7, 0.2))
for(x in colnames(env_world_df)[3:ncol(env_world_df)]) {
  lfit <- loess(paste(x, "~ population"), data=env_world_df)
  
  plot(as.formula(paste(x, "~ population")), env_world_df, 
       main=paste("population and\n", gsub("_", " ", x)), cex.main=2,
       type="p", col=seaborn_palette[1], yaxt='n', xaxt='n', pch=19)
  axis(side=1, at=axTicks(1), labels=format(axTicks(1), big.mark=',', scientific=FALSE))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE))
  pop_order <- order(env_world_df$population)
  lines(env_world_df$population[pop_order], lfit$fitted[pop_order], col=seaborn_palette[4], lwd=3)
}

dbDisconnect(conn)
[1] TRUE

User Data Application

# LINUX SHELL COMMAND CALL
system(paste0("gnome-terminal -- Rscript -e \"library(shiny); setwd('", getwd(), "'); runApp('EnvironmentDB_Shiny_App.R')\""))

Conclusion




  • Postgres as tool in data science workflow

  • Data persistence and hygiene

  • Centralization and security

  • Efficiency and usefulness

  • Scalability and applicability



---
title: "PostgreSQL As Data Science Database"
output: html_notebook
---

<style type="text/css">
.main-container {
  max-width: 1000px;
  margin-left: auto;
  margin-right: auto;
}
</style>

<br/>
<div style="font-size: 20px;">Chicago Postgres User Group Meetup, January 13, 2020</div>
<div style="float:left"><img src="IMAGES/postgresql_r.png" width="200px"/></div>
<br/><br/><br/><br/>

## Parfait Gasana ##
<div style="font-size: 20px;">Data Analyst, Winston & Strawn</div>
<div style="float:left"><img src="IMAGES/stackoverflow.png" width="30px"/></div>
<div style="font-size: 16px;">@Parfait (StackOverflow) | 
<img src="IMAGES/github.png" width="30px"/>&nbsp;&nbsp;@ParfaitG (GitHub)</div>

<hr style="border: none; height: 1px; background-color: #CCC;"/>

```{r setup, include=FALSE}
knitr::opts_chunk$set(root.dir = "/home/parfaitg/Documents/PGSQL")
```

<div style="float:right"><img src="IMAGES/open_source.png" width="75px"/></div>
<br/>

## Open Source Success Stories

<br/>

<div style="float:right"><img src="IMAGES/postgresql.jpeg" width="50px"/></div>
- ### PostgreSQL: powerful, extensible, analytical RDBMS
    - #### [Community](https://www.postgresql.org/community/): contributors, mailing lists, local user groups
<div style="float:right"><img src="IMAGES/r_logo.png" width="50px"/></div>
- ### R: data science stack, extensible environment
    - #### [CRAN](https://cran.r-project.org/) repository of packages
<div style="float:right"><img src="IMAGES/python.png" width="50px"/></div>
- ### Python: general purpose, extensible language
    - #### [PyPI](https://pypi.org/) ecosystem of modules

<div style="margin-left:40px;"/><h3>Challenging Commercial Market Share</h3></div>
<div style="float:left; margin-left:100px;"/><img src="IMAGES/commercial_tools.png" width="200px"/></div>
<br/>
<br/>
<br/>
<br/>

<hr style="border: none; height: 1px; background-color: #CCC;"/>

<br/>

<h2><span style="color: #336791"/>Relational Database Solution</span></h2>

<div style="float:right;"><img src="IMAGES/RDBMS_Group.png" width="250px"/></div>

- ### <span style="color: #336791"/>Data persistence: historical and current needs</span>
- ### <span style="color: #336791"/>Maintenance/hygiene: adherence to types</span>

<div style="float:right;"><img src="IMAGES/postgresql.jpeg" width="100px"/></div>

- ### <span style="color: #336791"/>Storage efficiency: normalization reduces repetition of data</span>
- ### <span style="color: #336791"/>Centralization: multiple user environment and security</span>
- ### <span style="color: #336791"/>Scalability: not limited to local resources</span>

<hr style="border: none; height: 1px; background-color: #CCC;"/>

<div style="float:right"><img src="IMAGES/docker_pgsql.png" width="150px"/></div>
## <span style="color: #336791"/>Use Case: Environment Database</a></span>

<h3><span style="color: #336791"/><i>Human Impact on the Global and Local Biosphere and Climate</i></span></h3>
<div style="float:center; text-align:center;"><img src="IMAGES/env_data.png"/></div>
<br/>

<hr style="border: none; height: 1px; background-color: #CCC;"/>

```{r}
suppressMessages(library(kableExtra))
suppressMessages(library(xts))

seaborn_palette <- c("#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B3", "#937860", 
                     "#DA8BC3", "#8C8C8C", "#CCB974", "#64B5CD", "#4C72B0", "#DD8452")
```

## Connection

- ### Centralized location for multiple users
- ### Well maintained API modules
- ### Multiple backend connection types

### <span style="color: #336791"/><a href="https://www.postgresql.org/ftp/odbc/versions/src/" target="blank">ODBC</a></span>

```{r}
library(DBI)
library(odbc)

conn <- dbConnect(odbc::odbc(), driver="PostgreSQL Unicode", 
                  server="localhost", database="environment",
                  uid="postgres", pwd="env19", port=6432)
dbListTables(conn)

dbDisconnect(conn)
```

### <span style="color: #336791"/><a href="https://jdbc.postgresql.org/download.html" target="blank">JDBC</a></span>

```{r}
library(rJava)
library(RJDBC)

drv <- JDBC("org.postgresql.Driver",
           "/usr/lib/jvm/java-8-oracle/lib/postgresql-42.2.2.jar")
conn <- dbConnect(drv, "jdbc:postgresql://localhost:6432/environment", "postgres", "env19")
dbListTables(conn, schema="public")

dbDisconnect(conn)
```


### <span style="color: #336791"/><a href="https://rdrr.io/cran/RPostgreSQL/" target="blank">R-Postgres API</a></span>

```{r}
library(RPostgreSQL)

conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="localhost", dbname="environment",
                  user="postgres", password="env19", port=6432)
dbListTables(conn)
```

<hr style="border: none; height: 1px; background-color: #CCC;"/>

## Import & Manage Data

- #### Seamless bulk text file import
- #### Data hygiene support
- #### Annotate sources with comments

```{sql, eval=FALSE}
CREATE TABLE global_temperature (
   id SERIAL NOT NULL PRIMARY KEY,
   year INT,
   period VARCHAR(50),
   global_mean NUMERIC(5,2)
);

\copy global_temperature (year, period, global_mean) FROM '/home/parfaitg/Databases/SQL_Server/ENVIRONMENT/global_temperature.csv' DELIMITER ',' CSV HEADER;

COMMENT ON TABLE global_temperature IS 'Source: NASA - Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, LOTI) (https://data.giss.nasa.gov/gistemp/)';
```

```{sql connection=conn, output.var = "comments_df"}
SELECT relname as table, obj_description(oid) as comment
FROM pg_class
WHERE relkind = 'r'
  AND obj_description(oid) IS NOT NULL
  ORDER BY relname
```

```{r}
kable_styling(kable(comments_df),
              bootstrap_options = c("striped", "hover"))
```

<hr style="border: none; height: 1px; background-color: #CCC;"/>

## Analyze Data

- #### Decalarative SQL to interact with many data
- #### Leverage query engine for data processing
- #### Readable, maintainable, tranferable

<div style="float:right"><a href="https://www.esrl.noaa.gov/gmd/obop/mlo/" blank="target"><img src="IMAGES/noaa.png" height="50px"/></a></div>
<div style="float:right"><a href="https://www.eia.gov/totalenergy/data/monthly/" blank ="taget"><img src="IMAGES/doe_eia.png" height="50px"/></a></div>


### <span style="color: #336791"/>Department of Energy's EIA: Monthly Energy Review &</span>
### <span style="color: #336791"/>NOAA Mauna Loa, Hawaii Observatory - Carbon PPM</span>

```{sql connection=conn, output.var = "metrics_df"}
SELECT p.date_year, p.date_month, CONCAT(p.date_year, '-', p.date_month, '-01')::date as date_day,
               p.average_ppm as "carbon ppm", c.energy_consumed as "btu consumed", e.energy_co2 as "co2 emissions"
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
  ORDER BY p.date_year, p.date_month
```

```{sql connection=conn, output.var = "agg_df"}
SELECT p.date_year,
       sum(p.average_ppm) as carbon_ppm_total, 
       avg(p.average_ppm) as carbon_ppm_mean, 
       sum(c.energy_consumed) as btu_consumed_total,
       avg(c.energy_consumed) as btu_consumed_mean,
       sum(e.energy_co2) as co2_emissions_total,
       avg(e.energy_co2) as co2_emissions_mean
  FROM ppm_month p
  JOIN consumption c 
    ON p.date_year = c.date_year AND p.date_month = c.date_month and c.msn = 'TXRCBUS'
  JOIN us_co2_emissions e
    ON p.date_year = e.date_year AND p.date_month = e.date_month and e.msn = 'TETCEUS'
GROUP BY p.date_year
ORDER BY p.date_year
```

```{r}
kable_styling(kable(tail(agg_df, 10)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig1a, fig.height = 7, fig.width = 13, fig.align = "center"}
metric_ts <- xts(x=metrics_df[c("carbon ppm", "btu consumed", "co2 emissions")], 
                 order.by=metrics_df$date_day)

plot(metric_ts, main = "Energy and Carbon PPM Metrics",
     legend.loc="bottomright", 
     col = seaborn_palette[1:3],
     yaxis.right=FALSE,
     axes=FALSE,
     lwd=1,
     cex.main=3,
     major.ticks="years",
     major.format="%Y",
     minor.format="%Y",
     grid.ticks.lty=1)
```

### <span style="color: #336791"/>Seasonal Decomposition</span>

```{r  fig1b, fig.height = 7, fig.width = 13, fig.align = "center"}
carbonppm_ts <- ts(metrics_df$`carbon ppm`, start=c(1973, 1), frequency=12)
carbonppm_stl <- stl(carbonppm_ts, s.window="periodic")

plot(carbonppm_stl, col = seaborn_palette[1], 
     main="Seasonal Decomposition of Global Carbon PPM")
```

```{r  fig1c, fig.height = 7, fig.width = 13, fig.align = "center"}
consumed_ts <- ts(metrics_df$`btu consumed`, start=c(1973, 1), frequency=12)
consumed_stl <- stl(consumed_ts, s.window="periodic")

plot(consumed_stl, col = seaborn_palette[2], 
     main="Seasonal Decomposition of U.S. Energy Consumption")
```

```{r fig1d, fig.height = 7, fig.width = 13, fig.align = "center"}
emissions_ts <- ts(metrics_df$`co2 emissions`, start=c(1973, 1), frequency=12)
emissions_stl <- stl(emissions_ts, s.window="periodic")

plot(emissions_stl, col = seaborn_palette[3],
     main = "Seasonal Decomposition of U.S. CO2 Emissions")
```

```{r}
sql <- "WITH c1 AS
           (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                   REPLACE(description, 'Primary Energy Consumed by the ', '') AS \"Sector\",
                   SUM(energy_consumed) AS \"Primary Energy Consumed\"
            FROM consumption
            WHERE msn IN ('TXICBUS', 'TXRCBUS', 'TXACBUS', 'TXCCBUS', 'TXEIBUS')
            GROUP BY CONCAT((date_year/10)::int * 10, 's'), description
           ), c2 AS
           (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                   REPLACE(description, 'Total Energy Consumed by the ', '') AS \"Sector\",
                   SUM(energy_consumed) AS \"Total Energy Consumed\"
            FROM consumption
            WHERE msn IN ('TECCBUS', 'TEACBUS', 'TEICBUS', 'TERCBUS')
            GROUP BY CONCAT((date_year/10)::int * 10, 's'), description
           )

        SELECT c1.decade, c1.\"Sector\", c1.\"Primary Energy Consumed\", c2.\"Total Energy Consumed\"
        FROM c1
        LEFT JOIN c2
            ON c1.\"Sector\" = c2.\"Sector\" AND c1.decade = c2.decade
        ORDER BY c1.decade, c1.\"Sector\"
       "

consumed_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(consumed_df)),
              bootstrap_options = c("striped", "hover"))
```

<div style="float:right"><img src="IMAGES/doe_eia.png" width="75px"/></div>
### <span style="color: #336791"/>Consumption and CO2 Emissions</span>


```{r fig1e, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(subset(consumed_df, Sector != 'Electric Power Sector'),
                 tapply(`Total Energy Consumed`, list(decade, `Sector`), sum))

par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Total Energy Consumption by Sector", cex.main=1.5,
        col=seaborn_palette[1:8], ylim=c(0, 8E5), xaxt="n", yaxt="n", beside=TRUE)

axis(side=1, at=c(5, 14, 23, 32), labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:8], ncol=8)
```

```{r fig1f, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(consumed_df, tapply(`Primary Energy Consumed`, list(decade, `Sector`), sum))

par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Primary Energy Consumption by Sector", cex.main=1.5,
        col=seaborn_palette[1:8], ylim=c(0, 9E5), xaxt="n", yaxt="n", beside=TRUE)

axis(side=1, at=c(5, 14, 23, 32, 41), labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:8], ncol=8)
```

```{r}
sql <- "SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
               REPLACE(REPLACE(description, 'Total Energy ', ''), ' CO2 Emissions', '') AS \"Sector\",
               SUM(energy_co2) AS \"Total CO2 Emissions\"
        FROM us_co2_emissions
        WHERE msn IN ('TERCEUS', 'TECCEUS', 'TEACEUS', 'TXEIEUS')
        GROUP BY date_year, description
        ORDER BY date_year, description
       "

emissions_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(emissions_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig1g, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(emissions_df, tapply(`Total CO2 Emissions`, list(decade, `Sector`), sum))

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. CO2 Emissions by Sector", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 6E4), xaxt="n", yaxt="n", beside=TRUE)

axis(side=1, at=c(3, 9, 15, 21)+0.5, labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```


```{r}
sql <- "WITH sub AS
          (SELECT CONCAT((date_year/10)::int * 10, 's') as decade, 
                  REPLACE($1, '%', '') as sector,
                  REPLACE(
                     REPLACE(
                       REPLACE(
                               REPLACE(description, 'Commercial Sector CO2 Emissions', ''), 
                               'Residential Sector CO2 Emissions', ''
                       ), 'Transportation Sector CO2 Emissions', ''),
                  ' ', '\n')  AS \"Type\",
                 energy_co2
           FROM us_co2_emissions
           WHERE description LIKE $2)

       SELECT decade, sector, \"Type\", 
              SUM(energy_co2) AS \"Total CO2 Emissions\"
       FROM sub
       GROUP BY decade, sector, \"Type\"
       ORDER BY decade, sector, \"Type\"
       "

params <- paste0(c("%Transportation", "%Residential", "%Commercial"), " Sector%")
emissions_type_df <- do.call(rbind, lapply(params, function(p) dbGetQuery(conn, sql, param=list(p, p))))
```

```{r fig1h, fig.height = 12, fig.width = 15, fig.align = "center"}

par(mfrow=c(3,2), mar=c(5, 5, 2, 1), mai = c(0.7, 0.2, 0.7, 0.2))
output <- by(emissions_type_df, emissions_type_df$sector, function(sub) {
  plot_mat <- with(sub, tapply(`Total CO2 Emissions`, list(decade, `Type`), sum))

  barplot(plot_mat, main=paste("U.S. CO2 Emissions by", sub$sector[[1]]), cex.main=1.5,
          col=seaborn_palette[1:5], ylim=c(0, ceiling(max(plot_mat, na.rm=TRUE)/1E4) * 1E4), 
          xaxt="n", yaxt="n", beside=TRUE)
  
  axis(side=1, at=c(3,9,15,21,27,33,39,45,51,57,63)[seq_along(colnames(plot_mat))] + 0.5,
       labels=colnames(plot_mat), padj=0.9, pos=c(0,0))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
  legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
})
```

<div style="float:right"><img src="IMAGES/doe_eia.png" width="75px"/></div>
### <span style="color: #336791"/>Renewable Energy: Production and Consumption</span>

```{r}
sql <- "SELECT energy_type,
               date,
               SUM(production) AS production,
               SUM(consumption) AS consumption
        FROM us_renewable_energy
        GROUP BY energy_type,
                 date
        ORDER BY energy_type,
                 date
       "

renewable_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(renewable_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r renewable_fig, fig.height = 15, fig.width = 15, fig.align = "center"}

par(mfrow=c(3,3), mar=c(5, 5, 2, 1))
output <- by(renewable_df, renewable_df$energy_type, function(sub) {
  
  metric_ts <- xts(x=sub[c("production", "consumption")], 
                   order.by=sub$date)
  
  print(plot(metric_ts, main = sub$energy_type[1],
             legend.loc="bottomright", 
             col = seaborn_palette[1:3],
             yaxis.right=FALSE,
             axes=FALSE,
             lwd=1,
             cex.main=3,
             major.ticks="years",
             major.format="%Y",
             minor.format="%Y",
             grid.ticks.lty=1))
  
})
```

<div style="float:right"><a href="https://waterdata.usgs.gov/nwis/gw"><img src="IMAGES/usgs.png" width="75px"/></a></div>

### <span style="color: #336791"/>U.S. Geological Survey - Groundwater Well Depth Data</span>
```{r}
sql <- "SELECT g.year, g.month,
               avg(g.mean_value) as mean_value
        FROM groundwater g
        WHERE g.year BETWEEN 1990 AND 2019
        GROUP BY g.year, g.month"

groundwater_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(groundwater_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig3, fig.height = 7, fig.width = 16, fig.align = "center"}
boxplot(mean_value ~ year, groundwater_df, col=seaborn_palette[1:10], 
        main="Groundwater Well Depth Mean Values", cex.main=1.5)
```


```{r}
sql <- "WITH sites AS (
            SELECT CONCAT(g.year, '-', g.month, '-1')::date AS \"date\",
                   g.site_name,
                   AVG(g.mean_value) as mean_value
            FROM groundwater g
            WHERE g.year BETWEEN 1990 AND 2019
            GROUP BY CONCAT(g.year, '-', g.month, '-1')::date,
                     g.site_name
        ), bottom AS (
            SELECT site_name, AVG(mean_value) AS mean_value, 'bottom_sites' AS category
            FROM sites 
            GROUP BY site_name
            ORDER BY AVG(mean_value) ASC LIMIT 5
        ), top AS (
            SELECT site_name, AVG(mean_value) AS mean_value, 'top_sites' AS category
            FROM sites 
            GROUP BY site_name
            ORDER BY AVG(mean_value) DESC LIMIT 5
        )

        SELECT sites.\"date\", sites.mean_value, sites.site_name, top.category
        FROM sites
        INNER JOIN top
            ON sites.site_name = top.site_name

        UNION

        SELECT sites.\"date\", sites.mean_value, sites.site_name, bottom.category
        FROM sites
        INNER JOIN bottom
            ON sites.site_name = bottom.site_name

        ORDER BY category, site_name, \"date\""

groundwater_df <- dbGetQuery(conn, sql)

kable_styling(kable(head(groundwater_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r groundwater_fig, fig.height = 25, fig.width = 15, fig.align = "center"}

groundwater_df$year <- format(groundwater_df$date, "%Y")

par(mfrow=c(5,2), mar=c(5, 5, 2, 1))
output <- by(groundwater_df, groundwater_df$site_name, function(sub) {
  
  metric_ts <- xts(x=sub[c("mean_value")], 
                   order.by=sub$date)
  
  print(plot(metric_ts, main = paste(sub$site_name[1], ':', sub$category[1]),
             legend.loc="bottomright", 
             col = seaborn_palette[1:3],
             yaxis.right=FALSE,
             axes=FALSE,
             lwd=1,
             cex.main=3,
             major.ticks="years",
             major.format="%Y",
             minor.format="%Y",
             grid.ticks.lty=1))
  
})
```

<div style="float:right"><a href="https://www.fws.gov/endangered/" target="blank"><img src="IMAGES/us_fws.png" width="75px"/></a></div>
### U.S. FWS Endangered Species List

```{r}
sql <- "SELECT CONCAT((f.date_year/10)::int * 10, 's') AS decade, 
               f.taxonomic_group,
               f.current_status,
               count(*) AS species_count
        FROM fws_species_year f
        WHERE f.current_status IN ('Endangered', 'Recovery', 'Resolved Taxon', 
                                   'Threatened', 'Extinction')
        GROUP BY CONCAT((f.date_year/10)::int * 10, 's'),
                 f.taxonomic_group,
                 f.current_status
        ORDER BY CONCAT((f.date_year/10)::int * 10, 's'),
                 f.taxonomic_group,
                 f.current_status"

fws_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(fws_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fws_fig, fig.height = 15, fig.width = 15, fig.align = "center"}

par(mfrow=c(5,3), mar=c(5, 5, 2, 1))
output <- by(fws_df, fws_df$taxonomic_group, function(sub) {
  plot_mat <- with(sub, tapply(species_count, list(decade, current_status), sum))
  
  barplot(plot_mat, main=sub$taxonomic_group[[1]], cex.main=1.5,
          ylim = c(0, max(plot_mat, na.rm=TRUE)+5),
          col=seaborn_palette[seq_along(row.names(plot_mat))], beside=TRUE)
  legend("top", row.names(plot_mat), fill=seaborn_palette[seq_along(row.names(plot_mat))], 
         ncol=length(row.names(plot_mat)))
  box()
})
```


<div style="float:right"><a href="https://www.nass.usda.gov/AgCensus/" target="blank"><img src="IMAGES/usda.png" width="100px"/></a></div>
### <span style="color: #336791"/>U.S. Department of Agriculture: Agriculture Census</span>

```{r fig4, fig.height = 6, fig.width = 17, fig.align = "center"}
sql <- "SELECT year,
               CASE
                    WHEN domain_category = '' THEN 'TOTAL\nOPERATIONS'
                    ELSE REPLACE(REPLACE(REPLACE(domain_category, '(', '\n('), 'TO', 'TO\n'), 'OR', 'OR\n')
               END AS domain_category, 
               value
        FROM ag_census 
        WHERE data_item = 'FARM OPERATIONS - NUMBER OF OPERATIONS'"

agcensus_df <- dbGetQuery(conn, sql)

plot_mat <- with(agcensus_df, tapply(value, list(year, domain_category), sum))

par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Farm Operations", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 2E6+5E5), beside=TRUE, xaxt="n", yaxt="n")

axis(side=1, at=c(3,9,15,21,27,33,39,45)+0.5, labels=colnames(plot_mat), tick=FALSE, padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```

```{r}
sql <- "WITH sub AS (
          SELECT year,
                 data_item,
                 REPLACE(
                    REPLACE(
                        REPLACE(
                           REPLACE(
                              REPLACE(domain_category, ';', ','),
                              '(', '\n('), 
                           'TO', 'TO\n'),
                        ' OR', ' OR\n'),
                    'LESS', 'LESS\n') AS domain_category, 
                 value
          FROM ag_census 
          WHERE data_item LIKE '%COMMODITY TOTALS%'
            AND value IS NOT NULL
       ) 

       SELECT year, 
              CASE
                  WHEN TRIM(domain_category) = '' 
                  THEN CASE 
                            WHEN data_item = 'COMMODITY TOTALS - SALES; MEASURED IN $' 
                            THEN 'TOTAL\nSALES'
                            WHEN data_item = 'COMMODITY TOTALS - SALES; MEASURED IN $ / OPERATION' 
                            THEN 'TOTAL\nSALES\nPER OPERATION'
                       END
                  ELSE domain_category 
              END AS domain_category,
              value
       FROM sub
      "

agcensus_df <- within(dbGetQuery(conn, sql),
                      domain_category <- factor(domain_category,
                                                levels = c("FARM SALES: \n(LESS\n THAN 2,500 $)", "FARM SALES: \n(2,500 TO\n 4,999 $)", 
                                                           "FARM SALES: \n(5,000 TO\n 9,999 $)", "FARM SALES: \n(10,000 TO\n 24,999 $)",
                                                           "FARM SALES: \n(25,000 TO\n 49,999 $)", "FARM SALES: \n(50,000 TO\n 99,999 $)",
                                                           "FARM SALES: \n(100,000 TO\n 499,999 $)", "FARM SALES: \n(500,000 OR\n MORE $)", 
                                                           "TOTAL\nSALES", "TOTAL\nSALES\nPER OPERATION"))
)

kable_styling(kable(tail(agcensus_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig5, fig.height = 6, fig.width = 17, fig.align = "center"}
plot_mat <- with(agcensus_df[agcensus_df$domain_category != 'TOTAL\nSALES',], 
                 tapply(value, list(year, factor(domain_category)), sum))

par(mar=c(5, 5, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Commodity Totals - Farm Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 1E6), beside=TRUE, xaxt="n", yaxt="n")

axis(side=1, at=c(3,9,15,21,27,33,39,45,51)+0.5, labels=colnames(plot_mat), tick=FALSE, padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```


```{r fig6, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(agcensus_df[agcensus_df$domain_category == 'TOTAL\nSALES',], 
                 tapply(value, list(year, factor(domain_category)), sum))

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Commodity Totals - Overall Farm Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 4E11+5E10), beside=TRUE, xaxt="n", yaxt="n", space=2)
        
axis(side=1, at=c(2.5,5.5,8.5,11.5,14.5), labels=row.names(plot_mat), padj=0.9, pos=c(0,0))
axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```

```{r}
sql <- "SELECT year, 
               REPLACE(REPLACE(data_item, ' - OPERATIONS WITH AREA HARVESTED', ''), '; GRAIN', '') as crop,
               value
        FROM ag_census
        WHERE data_item IN ('CORN; GRAIN - OPERATIONS WITH AREA HARVESTED',
                            'WHEAT - OPERATIONS WITH AREA HARVESTED',
                            'SOYBEANS - OPERATIONS WITH AREA HARVESTED')
        ORDER BY year, data_item
       "

crops_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(crops_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig7, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(crops_df, tapply(value, 
                                  list(year, factor(crop, levels=c("WHEAT", "SOYBEANS", "CORN"))), sum)
)

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Operations by Specific Crops", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 5E5), yaxt="n", beside=TRUE)

axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```

```{r}
sql <- "SELECT year, 
               REPLACE(
                  REPLACE(
                     REPLACE(data_item, ' - OPERATIONS WITH SALES', ''), 
                     '; INCL CALVES', ''),
                  '; BROILERS', '') as livestock,
               value
        FROM ag_census
        WHERE data_item IN ('CATTLE; INCL CALVES - OPERATIONS WITH SALES',
                            'HOGS - OPERATIONS WITH SALES',
                            'CHICKENS; BROILERS - OPERATIONS WITH SALES')
        ORDER BY year, data_item
       "

livestock_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(livestock_df)),
              bootstrap_options = c("striped", "hover"))
```


```{r fig8, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(livestock_df, tapply(value, 
                                  list(year, factor(livestock, levels=c( "CHICKENS", "HOGS", "CATTLE"))), sum)
)

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Number of Operations by Specific Livestock", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 1E6+2E5), yaxt="n", beside=TRUE)

axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```


```{r}
sql <- "SELECT year, 
               CASE
                   WHEN data_item = 'CROP TOTALS - SALES; MEASURED IN $' THEN 'CROP TOTALS'
                   WHEN data_item = 'ANIMAL TOTALS; INCL PRODUCTS - SALES; MEASURED IN $' THEN 'ANIMAL TOTALS'
               END as category,
               value
        FROM ag_census
        WHERE data_item IN ('CROP TOTALS - SALES; MEASURED IN $',
                            'ANIMAL TOTALS; INCL PRODUCTS - SALES; MEASURED IN $')
        ORDER BY year, data_item
       "

agtotal_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(agtotal_df)),
              bootstrap_options = c("striped", "hover"))
```

```{r fig9, fig.height = 6, fig.width = 15, fig.align = "center"}
plot_mat <- with(agtotal_df, tapply(value, list(year, category), sum))

par(mar=c(5, 8, 2, 1))
barplot(plot_mat, main="U.S. Agriculture Census: Crop vs Animal Sales", cex.main=1.5,
        col=seaborn_palette[1:5], ylim=c(0, 2E11+5E10), yaxt="n", beside=TRUE)

axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE), las=1)
legend("top", row.names(plot_mat), fill=seaborn_palette[1:5], ncol=5)
```

<div style="float:right"><img src="IMAGES/world_metrics.png" width="300px"/></div>
### World Metrics

```{r}
sql <- "WITH pop AS 
            (SELECT p.year,
                    p.population
             FROM world_population p
             WHERE p.country_name = 'World'
               AND p.year BETWEEN 2000 AND 2019
             ),
              
              land AS
            (SELECT a.year,
                    a.percent_arable
             FROM arable_land a
             WHERE a.country_name = 'World'
               AND a.year BETWEEN 2000 AND 2019
             ),
         
              fauna AS
            (SELECT i.year,
                    SUM(i.species_count) AS animals_count
             FROM iucn_species_count i
             WHERE i.year BETWEEN 2000 AND 2019
             GROUP BY i.year
             ),
         
              flora AS
            (SELECT p.assessment_year as year,
                    COUNT(*) AS plants_count
             FROM plants_assessments p
             WHERE p.assessment_year BETWEEN 2000 AND 2019
               AND p.interpreted_status = 'Threatened'
             GROUP BY p.assessment_year
             ),             
         
              ice AS
            (SELECT s.date_year as year,
                    AVG(s.extent) FILTER(WHERE s.region = 'Arctic') AS arctic_sea_ice_extent,
                    AVG(s.extent) FILTER(WHERE s.region = 'Antarctica') AS antarctic_sea_ice_extent
             FROM sea_ice_extent s
             WHERE s.date_year BETWEEN 2000 AND 2019
             GROUP BY s.date_year
             ),
             
              ocean AS
            (SELECT o.year as year,
                    AVG(o.tco2) AS total_co2,
                    AVG(o.phts25p0) AS ph_scale
             FROM ocean_data o
             WHERE o.year BETWEEN 2000 AND 2019
               AND o.tco2 <> -9999 AND o.phts25p0 <> -9999
             GROUP BY o.year
             ),
             
              temp AS
            (SELECT g.year as year,
                    AVG(g.global_mean) AS global_mean
             FROM global_temperature g
             WHERE g.year BETWEEN 2000 AND 2019
             GROUP BY g.year
             )
             
         SELECT pop.year, pop.population, land.percent_arable, fauna.animals_count, flora.plants_count,
                ice.arctic_sea_ice_extent, ice.antarctic_sea_ice_extent, 
                ocean.total_co2, ocean.ph_scale, temp.global_mean
         FROM pop 
         JOIN land USING (year)
         JOIN fauna USING (year)
         JOIN flora USING (year)
         JOIN ice USING (year)
         JOIN ocean USING (year)
         JOIN temp USING (year)
         ORDER BY pop.year"

env_world_df <- dbGetQuery(conn, sql)

kable_styling(kable(tail(env_world_df)), font_size = 12,
              bootstrap_options = c("striped", "hover"))
```

```{r}
kable_styling(kable(cor(env_world_df[-1])), font_size = 12,
              bootstrap_options = c("striped", "hover"))
```

```{r fig10, fig.height = 15, fig.width = 15, fig.align = "center"}
par(mfrow=c(4,2), mai = c(0.3, 0.2, 0.7, 0.2))

for(x in colnames(env_world_df)[3:ncol(env_world_df)]) {
  lfit <- loess(paste(x, "~ population"), data=env_world_df)
  
  plot(as.formula(paste(x, "~ population")), env_world_df, 
       main=paste("population and\n", gsub("_", " ", x)), cex.main=2,
       type="p", col=seaborn_palette[1], yaxt='n', xaxt='n', pch=19)
  axis(side=1, at=axTicks(1), labels=format(axTicks(1), big.mark=',', scientific=FALSE))
  axis(side=2, at=axTicks(2), labels=format(axTicks(2), big.mark=',', scientific=FALSE))
  pop_order <- order(env_world_df$population)
  lines(env_world_df$population[pop_order], lfit$fitted[pop_order], col=seaborn_palette[4], lwd=3)
}

```


```{r}
dbDisconnect(conn)
```

<div style="float:right"><img src="IMAGES/r_shiny.png"/></div>

### User Data Application

<div style="float:center"><img src="IMAGES/env_data.png" width="400px"/></div>

```{r}

# LINUX SHELL COMMAND CALL
system(paste0("gnome-terminal -- Rscript -e \"library(shiny); setwd('", getwd(), "'); runApp('EnvironmentDB_Shiny_App.R')\""))

```

<hr style="border: none; height: 1px; background-color: #CCC;"/>

<h2><span style="color: #336791"/>Conclusion</span></h2>

<div style="float:right;"><img src="IMAGES/data_pipeline.png" width="350px"/></div>
<br/>

<div style="float:left;"><img src="IMAGES/postgresql_r.png" width="100px"/></div>
<br/>
<br/>

- <h3><span style="color: #336791"/>Postgres as tool in data science workflow</h3>
- <h3><span style="color: #336791"/>Data persistence and hygiene</h3>
- <h3><span style="color: #336791"/>Centralization and security</h3>
- <h3><span style="color: #336791"/>Efficiency and usefulness</h3>
- <h3><span style="color: #336791"/>Scalability and applicability</h3>

<br/>
<br/>


